{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Azure Cognitive Search"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Basic Example\n",
    "\n",
    "In this basic example, we take  a Paul Graham essay, split it into chunks, embed it using an OpenAI embedding model, load it into an Azure Cognitive Search index, and then query it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import sys\n",
    "from IPython.display import Markdown, display\n",
    "\n",
    "# logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
    "# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n",
    "\n",
    "# logger = logging.getLogger(__name__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!{sys.executable} -m pip install llama-index\n",
    "#!{sys.executable} -m pip install azure-search-documents==11.4.0b8\n",
    "#!{sys.executable} -m pip install azure-identity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up OpenAI\n",
    "import os\n",
    "import getpass\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")\n",
    "import openai\n",
    "\n",
    "openai.api_key = os.environ[\"OPENAI_API_KEY\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up Azure Cognitive Search\n",
    "from azure.search.documents.indexes import SearchIndexClient\n",
    "from azure.search.documents import SearchClient\n",
    "from azure.core.credentials import AzureKeyCredential\n",
    "\n",
    "search_service_name = getpass.getpass(\"Azure Cognitive Search Service Name\")\n",
    "\n",
    "key = getpass.getpass(\"Azure Cognitive Search Key\")\n",
    "\n",
    "cognitive_search_credential = AzureKeyCredential(key)\n",
    "\n",
    "service_endpoint = f\"https://{search_service_name}.search.windows.net\"\n",
    "\n",
    "# Index name to use\n",
    "index_name = \"quickstart\"\n",
    "\n",
    "# Use index client to demonstrate creating an index\n",
    "index_client = SearchIndexClient(\n",
    "    endpoint=service_endpoint,\n",
    "    credential=cognitive_search_credential,\n",
    ")\n",
    "\n",
    "# Use search client to demonstration using existing index\n",
    "search_client = SearchClient(\n",
    "    endpoint=service_endpoint,\n",
    "    index_name=index_name,\n",
    "    credential=cognitive_search_credential,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create Index (if it does not exist)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Demonstrates creating a vector index named quickstart01 if one doesn't exist. The index has the following fields:\n",
    "- id (Edm.String)\n",
    "- content (Edm.String)\n",
    "- embedding (Edm.SingleCollection)\n",
    "- li_jsonMetadata (Edm.String)\n",
    "- li_doc_id (Edm.String)\n",
    "- author (Edm.String)\n",
    "- theme (Edm.String)\n",
    "- director (Edm.String)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from azure.search.documents import SearchClient\n",
    "from llama_index.vector_stores import CognitiveSearchVectorStore\n",
    "from llama_index.vector_stores.cogsearch import (\n",
    "    IndexManagement,\n",
    "    MetadataIndexFieldType,\n",
    "    CognitiveSearchVectorStore,\n",
    ")\n",
    "\n",
    "# Example of a complex mapping, metadata field 'theme' is mapped to a differently name index field 'topic' with its type explicitly set\n",
    "metadata_fields = {\n",
    "    \"author\": \"author\",\n",
    "    \"theme\": (\"topic\", MetadataIndexFieldType.STRING),\n",
    "    \"director\": \"director\",\n",
    "}\n",
    "\n",
    "# A simplified metadata specification is available if all metadata and index fields are similarly named\n",
    "# metadata_fields = {\"author\", \"theme\", \"director\"}\n",
    "\n",
    "\n",
    "vector_store = CognitiveSearchVectorStore(\n",
    "    search_or_index_client=index_client,\n",
    "    index_name=index_name,\n",
    "    filterable_metadata_field_keys=metadata_fields,\n",
    "    index_management=IndexManagement.CREATE_IF_NOT_EXISTS,\n",
    "    id_field_key=\"id\",\n",
    "    chunk_field_key=\"content\",\n",
    "    embedding_field_key=\"embedding\",\n",
    "    metadata_string_field_key=\"li_jsonMetadata\",\n",
    "    doc_id_field_key=\"li_doc_id\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define embedding function\n",
    "from llama_index.embeddings import OpenAIEmbedding\n",
    "from llama_index import (\n",
    "    SimpleDirectoryReader,\n",
    "    StorageContext,\n",
    "    ServiceContext,\n",
    "    VectorStoreIndex,\n",
    ")\n",
    "\n",
    "embed_model = OpenAIEmbedding()\n",
    "\n",
    "# load documents\n",
    "documents = SimpleDirectoryReader(\n",
    "    \"../../../examples/paul_graham_essay/data\"\n",
    ").load_data()\n",
    "\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "service_context = ServiceContext.from_defaults(embed_model=embed_model)\n",
    "index = VectorStoreIndex.from_documents(\n",
    "    documents, storage_context=storage_context, service_context=service_context\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "<b>The author wrote short stories and programmed on an IBM 1401 computer during their time in school. They later got their own microcomputer, a TRS-80, and started programming games and a word processor.</b>"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Query Data\n",
    "query_engine = index.as_query_engine(similarity_top_k=3)\n",
    "response = query_engine.query(\"What did the author do growing up?\")\n",
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "<b>The author learned several things during their time at Interleaf. They learned that it's better for technology companies to be run by product people than sales people, that code edited by too many people leads to bugs, that cheap office space is not worth it if it's depressing, that planned meetings are inferior to corridor conversations, that big bureaucratic customers can be a dangerous source of money, and that there's not much overlap between conventional office hours and the optimal time for hacking. However, the most important thing the author learned is that the low end eats the high end, meaning that it's better to be the \"entry level\" option because if you're not, someone else will be and will surpass you.</b>"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "response = query_engine.query(\n",
    "    \"What did the author learn?\",\n",
    ")\n",
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use Existing Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.vector_stores import CognitiveSearchVectorStore\n",
    "from llama_index.vector_stores.cogsearch import (\n",
    "    IndexManagement,\n",
    "    MetadataIndexFieldType,\n",
    "    CognitiveSearchVectorStore,\n",
    ")\n",
    "\n",
    "\n",
    "index_name = \"quickstart\"\n",
    "\n",
    "metadata_fields = {\n",
    "    \"author\": \"author\",\n",
    "    \"theme\": (\"topic\", MetadataIndexFieldType.STRING),\n",
    "    \"director\": \"director\",\n",
    "}\n",
    "vector_store = CognitiveSearchVectorStore(\n",
    "    search_or_index_client=search_client,\n",
    "    filterable_metadata_field_keys=metadata_fields,\n",
    "    index_management=IndexManagement.NO_VALIDATION,\n",
    "    id_field_key=\"id\",\n",
    "    chunk_field_key=\"content\",\n",
    "    embedding_field_key=\"embedding\",\n",
    "    metadata_string_field_key=\"li_jsonMetadata\",\n",
    "    doc_id_field_key=\"li_doc_id\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define embedding function\n",
    "from llama_index.embeddings import OpenAIEmbedding\n",
    "from llama_index import (\n",
    "    SimpleDirectoryReader,\n",
    "    StorageContext,\n",
    "    ServiceContext,\n",
    "    VectorStoreIndex,\n",
    ")\n",
    "\n",
    "embed_model = OpenAIEmbedding()\n",
    "\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "service_context = ServiceContext.from_defaults(embed_model=embed_model)\n",
    "index = VectorStoreIndex.from_documents(\n",
    "    [], storage_context=storage_context, service_context=service_context\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "<b>The author experienced a difficult moment when their mother had a stroke and was put in a nursing home. The stroke destroyed her balance, and the author and their sister were determined to help her get out of the nursing home and back to her house.</b>"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "query_engine = index.as_query_engine()\n",
    "response = query_engine.query(\"What was a hard moment for the author?\")\n",
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "<b>The author of the given context is Paul Graham.</b>"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "response = query_engine.query(\"Who is the author?\")\n",
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "At Interleaf, there was a group called Release Engineering that seemed to be as big as the group that actually wrote the software. The software at Interleaf had to be updated on the server, and there was a lot of emphasis on high production values to make the online store builders look legitimate.\n",
      "\n",
      "Streamed output at 20.953424485215063 tokens/s\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "query_engine = index.as_query_engine(streaming=True)\n",
    "response = query_engine.query(\"What happened at interleaf?\")\n",
    "\n",
    "start_time = time.time()\n",
    "\n",
    "token_count = 0\n",
    "for token in response.response_gen:\n",
    "    print(token, end=\"\")\n",
    "    token_count += 1\n",
    "\n",
    "time_elapsed = time.time() - start_time\n",
    "tokens_per_second = token_count / time_elapsed\n",
    "\n",
    "print(f\"\\n\\nStreamed output at {tokens_per_second} tokens/s\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Adding a document to existing index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "<b>The color of the sky can vary depending on various factors such as time of day, weather conditions, and location. It can range from shades of blue during the day to hues of orange, pink, and purple during sunrise or sunset.</b>"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "response = query_engine.query(\"What colour is the sky?\")\n",
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import Document\n",
    "\n",
    "index.insert_nodes([Document(text=\"The sky is indigo today\")])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "<b>The colour of the sky is indigo.</b>"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "response = query_engine.query(\"What colour is the sky?\")\n",
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.schema import TextNode\n",
    "\n",
    "nodes = [\n",
    "    TextNode(\n",
    "        text=\"The Shawshank Redemption\",\n",
    "        metadata={\n",
    "            \"author\": \"Stephen King\",\n",
    "            \"theme\": \"Friendship\",\n",
    "        },\n",
    "    ),\n",
    "    TextNode(\n",
    "        text=\"The Godfather\",\n",
    "        metadata={\n",
    "            \"director\": \"Francis Ford Coppola\",\n",
    "            \"theme\": \"Mafia\",\n",
    "        },\n",
    "    ),\n",
    "    TextNode(\n",
    "        text=\"Inception\",\n",
    "        metadata={\n",
    "            \"director\": \"Christopher Nolan\",\n",
    "        },\n",
    "    ),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "index.insert_nodes(nodes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[NodeWithScore(node=TextNode(id_='5a97da0c-8f04-4c63-b90b-8c474d8c273d', embedding=None, metadata={'director': 'Francis Ford Coppola', 'theme': 'Mafia'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='81cf4b9e847ba42e83fc401e31af8e17d629f0d5cf9c0c320ec7ac69dd0257e1', text='The Godfather', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=0.81316805)]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters\n",
    "\n",
    "\n",
    "filters = MetadataFilters(filters=[ExactMatchFilter(key=\"theme\", value=\"Mafia\")])\n",
    "\n",
    "retriever = index.as_retriever(filters=filters)\n",
    "retriever.retrieve(\"What is inception about?\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llamaindextest01",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
